1 Set Working Directory

source("../shared/functions.R") 
here::i_am("rmd3/a1_preprocess.rmd")
## here() starts at /Users/wang.13246/Documents/Project/Huazhu_scRNAseq
print(paste("Current working directory:", here::here()))
## [1] "Current working directory: /Users/wang.13246/Documents/Project/Huazhu_scRNAseq"

2 1. Load All Raw Data

Load all 10X data from ../data_19072-23-26 directory.

data_dir <- "../data_19072-23-26"
sample_dirs <- list.files(data_dir)
data_dirs <- file.path(data_dir, sample_dirs, "filtered_feature_bc_matrix")
data_list <- list()

cat(paste("Loading from:", data_dir, "\n"))
## Loading from: ../data_19072-23-26
cat(paste("Found", length(data_dirs), "samples to load.\n\n"))
## Found 8 samples to load.
for (i in seq_along(data_dirs)){
  current_sample_name <- sample_dirs[i]
  cat(paste("Loading sample:", current_sample_name, "\n"))
  
  tmp <- Read10X(
    data_dirs[i],
    unique.features = TRUE,
    strip.suffix = FALSE
  )
  
  data_list[[i]] <- CreateSeuratObject(
    tmp,
    assay = "RNA",
    min.cells = 3,
    min.features = 200,
    project = current_sample_name
  )
  
  # Add QC metrics
  # Ribosomal genes
  rb.genes <- rownames(data_list[[i]])[grep("^Rp[sl][[:digit:]]", rownames(data_list[[i]]))]
  if (length(rb.genes) > 0) {
    percent.ribo <- colSums(GetAssayData(data_list[[i]], slot="counts")[rb.genes, ]) / 
      Matrix::colSums(GetAssayData(data_list[[i]], slot="counts")) * 100
    data_list[[i]] <- AddMetaData(data_list[[i]], percent.ribo, col.name = "percent.ribo")
  } else {
    data_list[[i]]$percent.ribo <- 0
  }
  
  # Mitochondrial genes
  data_list[[i]] <- PercentageFeatureSet(data_list[[i]], "^mt-", col.name = "percent.mito")
  
  # Hemoglobin genes (potential contamination)
  data_list[[i]] <- PercentageFeatureSet(data_list[[i]], "^Hb[^(p)]", col.name = "percent.hb")
  
  # Add sample name as metadata
  data_list[[i]]@meta.data$sample <- current_sample_name
}
## Loading sample: 19072-23-01-01-01
## Warning: The `slot` argument of `GetAssayData()` is deprecated as of SeuratObject 5.0.0.
## ℹ Please use the `layer` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Loading sample: 19072-23-02-01-01 
## Loading sample: 19072-23-03-01-01 
## Loading sample: 19072-23-04-01-01 
## Loading sample: 19072-26-01-01-01 
## Loading sample: 19072-26-02-01-01 
## Loading sample: 19072-26-03-01-01 
## Loading sample: 19072-26-04-01-01
# Get all loaded sample names
all_sample_names <- sapply(data_list, function(x) unique(x$sample))
names(data_list) <- all_sample_names

cat("\nAll loaded sample names:\n")
## 
## All loaded sample names:
print(all_sample_names)
## [1] "19072-23-01-01-01" "19072-23-02-01-01" "19072-23-03-01-01"
## [4] "19072-23-04-01-01" "19072-26-01-01-01" "19072-26-02-01-01"
## [7] "19072-26-03-01-01" "19072-26-04-01-01"
cat(paste("\nTotal samples loaded:", length(data_list), "\n"))
## 
## Total samples loaded: 8
rm(tmp, rb.genes, percent.ribo, i, current_sample_name, data_dirs, sample_dirs)
gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used   (Mb)
## Ncells   8686806  464.0   14174291  757.0         NA   14174291  757.0
## Vcells 441907191 3371.5 1038783050 7925.3     102400 1033656499 7886.2

3 2. Define scuttle::isOutlier QC Function

This function uses scuttle::isOutlier for automated, data-driven QC thresholds.

automated_qc_filtering <- function(seurat_obj, sample_name, nmads = 3) {
  cat(paste("\n  Processing", sample_name, "with nmads =", nmads, "...\n"))
  
  df <- seurat_obj@meta.data
  
  # Identify outliers for each metric
  # Library size (nCount_RNA) - both directions, log scale
  outlier_libsize <- isOutlier(df$nCount_RNA, type = "both", log = TRUE, nmads = nmads)
  
  # Number of features (nFeature_RNA) - both directions, log scale
  outlier_features <- isOutlier(df$nFeature_RNA, type = "both", log = TRUE, nmads = nmads)
  
  # Mitochondrial percentage - higher direction only
  outlier_mito <- isOutlier(df$percent.mito, type = "higher", nmads = nmads)
  
  # Combine all outlier criteria
  outlier_combined <- outlier_libsize | outlier_features | outlier_mito
  
  # Extract thresholds
  thresh_libsize <- attr(outlier_libsize, "thresholds")
  thresh_features <- attr(outlier_features, "thresholds")
  thresh_mito <- attr(outlier_mito, "thresholds")
  
  cat("    Adaptive thresholds calculated:\n")
  cat("      nCount_RNA: [", round(thresh_libsize["lower"]), " - ", 
      round(thresh_libsize["higher"]), "]\n", sep = "")
  cat("      nFeature_RNA: [", round(thresh_features["lower"]), " - ", 
      round(thresh_features["higher"]), "]\n", sep = "")
  cat("      percent.mito: < ", round(thresh_mito["higher"], 2), "\n", sep = "")
  
  cat("\n    Outliers identified:\n")
  cat("      Library size (low/high UMI):", sum(outlier_libsize), 
      "(", round(sum(outlier_libsize)/nrow(df)*100, 1), "%)\n")
  cat("      Feature count (low/high genes):", sum(outlier_features), 
      "(", round(sum(outlier_features)/nrow(df)*100, 1), "%)\n")
  cat("      High mitochondrial:", sum(outlier_mito), 
      "(", round(sum(outlier_mito)/nrow(df)*100, 1), "%)\n")
  cat("      TOTAL outliers:", sum(outlier_combined), 
      "(", round(sum(outlier_combined)/nrow(df)*100, 1), "%)\n")
  cat("      Cells to KEEP:", sum(!outlier_combined), 
      "(", round(sum(!outlier_combined)/nrow(df)*100, 1), "%)\n")
  
  # Add outlier flags to metadata
  seurat_obj$outlier_libsize <- outlier_libsize
  seurat_obj$outlier_features <- outlier_features
  seurat_obj$outlier_mito <- outlier_mito
  seurat_obj$outlier_combined <- outlier_combined
  
  # Store thresholds
  seurat_obj@misc$qc_thresholds <- list(
    libsize_lower = thresh_libsize["lower"],
    libsize_higher = thresh_libsize["higher"],
    features_lower = thresh_features["lower"],
    features_higher = thresh_features["higher"],
    mito_higher = thresh_mito["higher"]
  )
  
  return(seurat_obj)
}

4 3. Define Processing Function with Harmony

process_project <- function(seurat_list, project_name, sample_metadata_df, output_file,
                            nmads = 3,
                            run_harmony = TRUE,
                            harmony_group_by = "sample") {
  
  cat(paste("\n========================================\n"))
  cat(paste("Processing Project:", project_name, "\n"))
  cat(paste("========================================\n"))
  
  # 1. Apply automated QC to each sample
  cat("\n--- Step 1: Automated QC with scuttle::isOutlier ---\n")
  for (i in seq_along(seurat_list)) {
    sample_name <- unique(seurat_list[[i]]$sample)
    seurat_list[[i]] <- automated_qc_filtering(seurat_list[[i]], sample_name, nmads = nmads)
  }
  
  # 2. Pre-filtering summary
  cat("\n--- Step 2: Pre-filtering QC Summary ---\n")
  pre_filter_counts <- sapply(seurat_list, ncol)
  names(pre_filter_counts) <- sapply(seurat_list, function(x) unique(x$sample))
  print(data.frame(Sample = names(pre_filter_counts), Cells_Before = pre_filter_counts))
  
  # 3. Filter outliers
  cat("\n--- Step 3: Filtering Outlier Cells ---\n")
  for (i in seq_along(seurat_list)) {
    sample_name <- unique(seurat_list[[i]]$sample)
    n_before <- ncol(seurat_list[[i]])
    seurat_list[[i]] <- subset(seurat_list[[i]], subset = outlier_combined == FALSE)
    n_after <- ncol(seurat_list[[i]])
    cat(paste(" ", sample_name, ":", n_before, "->", n_after, "cells\n"))
  }
  
  # 4. Merge objects
  cat("\n--- Step 4: Merging Samples ---\n")
  if (length(seurat_list) > 1) {
    sample_names_for_merge <- sapply(seurat_list, function(x) unique(x$sample))
    combined <- merge(x = seurat_list[[1]], 
                      y = seurat_list[2:length(seurat_list)], 
                      add.cell.ids = sample_names_for_merge)
  } else {
    combined <- seurat_list[[1]]
  }
  
  # Fix sample column from orig.ident
  if ("orig.ident" %in% colnames(combined@meta.data)) {
    combined$sample <- combined$orig.ident
  }
  
  cat(paste("Total cells after filtering:", ncol(combined), "\n"))
  
  # 5. Add project metadata
  cat("\n--- Step 5: Adding Project Metadata ---\n")
  original_meta <- combined@meta.data
  original_meta$orig.ident_join_key <- original_meta$orig.ident
  sample_metadata_df$orig.ident_join_key <- sample_metadata_df$sample
  
  new_meta <- dplyr::left_join(original_meta, 
                               sample_metadata_df[, !colnames(sample_metadata_df) == "sample"], 
                               by = "orig.ident_join_key")
  rownames(new_meta) <- rownames(original_meta)
  combined@meta.data <- new_meta
  
  new_cols <- colnames(sample_metadata_df)[!colnames(sample_metadata_df) %in% c("sample", "orig.ident_join_key")]
  for(col in new_cols) {
    cat(paste("--- Table for:", col, "---\n"))
    print(table(combined@meta.data[, col], useNA = "ifany"))
  }
  
  # 6. Post-filtering summary
  cat("\n--- Step 6: Post-filtering Statistics ---\n")
  qc_summary <- combined@meta.data %>%
    group_by(sample) %>%
    summarise(
      n_cells = n(),
      median_nFeature = median(nFeature_RNA),
      median_nCount = median(nCount_RNA),
      median_mito = round(median(percent.mito), 2),
      .groups = "drop"
    )
  print(qc_summary)
  
  # 7. QC Violin plots
  cat("\n--- Step 7: Generating QC Plots ---\n")
  p_qc <- VlnPlot(combined, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"),
                  group.by = "sample", pt.size = 0, ncol = 3) +
    plot_annotation(title = paste(project_name, "- QC After Filtering"))
  print(p_qc)
  
  # 8. Standard Seurat workflow
  cat("\n--- Step 8: Running Seurat Analysis Pipeline ---\n")
  cat("  Normalizing...\n")
  combined <- NormalizeData(combined, normalization.method = "LogNormalize", 
                            scale.factor = 10000, verbose = FALSE)
  
  cat("  Finding variable features...\n")
  combined <- FindVariableFeatures(combined, selection.method = "vst", 
                                   nfeatures = 3000, verbose = FALSE)
  
  cat("  Scaling data (regressing out percent.mito)...\n")
  combined <- ScaleData(combined, vars.to.regress = "percent.mito", verbose = FALSE)
  
  cat("  Running PCA...\n")
  combined <- RunPCA(combined, npcs = 50, verbose = FALSE)
  
  # Elbow plot
  p_elbow <- ElbowPlot(combined, ndims = 50) + ggtitle(paste(project_name, "- PCA Elbow Plot"))
  print(p_elbow)
  
  reduction_to_use <- "pca"
  
  # 9. Harmony batch correction
  if (run_harmony && length(unique(combined@meta.data[[harmony_group_by]])) > 1) {
    cat("\n--- Step 9: Running Harmony Batch Correction ---\n")
    combined <- harmony::RunHarmony(
      combined,
      group.by.vars = harmony_group_by,
      assay.use = "RNA",
      verbose = FALSE
    )
    reduction_to_use <- "harmony"
    cat("  Using Harmony reduction\n")
  } else {
    cat("\n--- Step 9: Skipping Harmony (single sample or disabled) ---\n")
  }
  
  # 10. UMAP and clustering
  cat("\n--- Step 10: Running UMAP and Clustering ---\n")
  n_dims <- 30
  combined <- RunUMAP(combined, reduction = reduction_to_use, dims = 1:n_dims, verbose = FALSE)
  combined <- FindNeighbors(combined, reduction = reduction_to_use, dims = 1:n_dims, verbose = FALSE)
  
  # Test multiple resolutions
  resolutions <- c(0.1, 0.2, 0.3, 0.5, 0.8, 1.0)
  for (res in resolutions) {
    combined <- FindClusters(combined, resolution = res, verbose = FALSE)
  }
  
  cat("  Cluster counts:\n")
  for (res in resolutions) {
    col_name <- paste0("RNA_snn_res.", res)
    n_clusters <- length(unique(combined@meta.data[[col_name]]))
    cat("    Resolution", res, ":", n_clusters, "clusters\n")
  }
  
  Idents(combined) <- "RNA_snn_res.0.3"
  
  # 11. Visualizations
  cat("\n--- Step 11: Generating UMAP Plots ---\n")
  
  p_cluster <- DimPlot(combined, group.by = "RNA_snn_res.0.3", label = TRUE, label.size = 5) +
    ggtitle(paste(project_name, "- Clusters (Resolution 0.3)"))
  
  p_sample <- DimPlot(combined, group.by = "sample", label = FALSE, pt.size = 0.3) +
    ggtitle(paste(project_name, "- Sample Distribution"))
  
  print(p_cluster | p_sample)
  
  # Split by sample
  p_split <- DimPlot(combined, split.by = "sample", label = TRUE, ncol = 2) +
    ggtitle(paste(project_name, "- Clusters Split by Sample"))
  print(p_split)
  
  # QC on UMAP
  p_qc_umap <- FeaturePlot(combined, features = c("nFeature_RNA", "nCount_RNA", 
                                                   "percent.mito", "percent.ribo"), ncol = 2)
  print(p_qc_umap)
  
  # Plot by metadata groups
  for (group_name in new_cols) {
    p_group <- DimPlot(combined, group.by = group_name, label = TRUE, pt.size = 0.3, 
                       repel = TRUE, label.box = TRUE) +
      ggtitle(paste(project_name, "-", group_name))
    print(p_group)
  }
  
  # 12. Check marker genes
  cat("\n--- Step 12: Checking Marker Genes ---\n")
  markers <- c(
    "Tnnt2", "Myh6", "Myh7",    # Cardiomyocytes
    "Col1a1", "Col3a1", "Dcn",  # Fibroblasts
    "Pecam1", "Cdh5", "Vwf",    # Endothelial
    "Ptprc", "Cd68",            # Immune
    "Acta2", "Myh11"            # Smooth muscle
  )
  
  markers_present <- markers[markers %in% rownames(combined)]
  cat("  Markers found:", paste(markers_present, collapse = ", "), "\n")
  
  if (length(markers_present) > 0) {
    p_markers <- FeaturePlot(combined, features = markers_present[1:min(9, length(markers_present))],
                              ncol = 3, order = TRUE)
    print(p_markers)
    
    p_dot <- DotPlot(combined, features = markers_present, group.by = "RNA_snn_res.0.3") +
      RotatedAxis() +
      ggtitle(paste(project_name, "- Marker Expression by Cluster"))
    print(p_dot)
  }
  
  # 13. Save
  cat(paste("\n--- Step 13: Saving processed object to:", output_file, "---\n"))
  qs::qsave(combined, output_file)
  
  cat(paste("\n========================================\n"))
  cat(paste("Finished Project:", project_name, "\n"))
  cat(paste("Total cells:", ncol(combined), "\n"))
  cat(paste("========================================\n\n"))
  
  return(combined)
}

5 4. Project 1: 19072-23

Samples: - 19072-23-01-01-01 - 19072-23-02-01-01 - 19072-23-03-01-01 - 19072-23-04-01-01

proj23_sample_names <- c("19072-23-01-01-01", "19072-23-02-01-01", 
                         "19072-23-03-01-01", "19072-23-04-01-01")

proj23_indices <- which(all_sample_names %in% proj23_sample_names)

if (length(proj23_indices) > 0) {
  proj23_list <- data_list[proj23_indices]
  
  # Create metadata - UPDATE group COLUMN WITH ACTUAL EXPERIMENTAL GROUPS
  proj23_meta <- data.frame(
    sample = proj23_sample_names,
    group = c("Sample_01", "Sample_02", "Sample_03", "Sample_04")  # TODO: Update with real group labels
  )
  
  proj23_obj <- process_project(
    seurat_list = proj23_list,
    project_name = "Project_19072_23",
    sample_metadata_df = proj23_meta,
    output_file = "proj23_harmony_processed.qsave",
    nmads = 3,  # Adjust: 2.5 = stricter, 5 = more lenient
    run_harmony = TRUE,
    harmony_group_by = "sample"
  )
} else {
  cat("No samples found for Project 19072-23.\n")
  cat("Looking for:", proj23_sample_names, "\n")
  cat("Available samples:", all_sample_names, "\n")
}
## 
## ========================================
## Processing Project: Project_19072_23 
## ========================================
## 
## --- Step 1: Automated QC with scuttle::isOutlier ---
## 
##   Processing 19072-23-01-01-01 with nmads = 3 ...
##     Adaptive thresholds calculated:
##       nCount_RNA: [217 - 2522402]
##       nFeature_RNA: [566 - 32886]
##       percent.mito: < 2.07
## 
##     Outliers identified:
##       Library size (low/high UMI): 0 ( 0 %)
##       Feature count (low/high genes): 368 ( 5.5 %)
##       High mitochondrial: 431 ( 6.5 %)
##       TOTAL outliers: 629 ( 9.5 %)
##       Cells to KEEP: 6021 ( 90.5 %)
## 
##   Processing 19072-23-02-01-01 with nmads = 3 ...
##     Adaptive thresholds calculated:
##       nCount_RNA: [30 - 5213341]
##       nFeature_RNA: [154 - 74539]
##       percent.mito: < 2.29
## 
##     Outliers identified:
##       Library size (low/high UMI): 0 ( 0 %)
##       Feature count (low/high genes): 0 ( 0 %)
##       High mitochondrial: 469 ( 8.3 %)
##       TOTAL outliers: 469 ( 8.3 %)
##       Cells to KEEP: 5191 ( 91.7 %)
## 
##   Processing 19072-23-03-01-01 with nmads = 3 ...
##     Adaptive thresholds calculated:
##       nCount_RNA: [2046 - 254699]
##       nFeature_RNA: [1395 - 12072]
##       percent.mito: < 1.79
## 
##     Outliers identified:
##       Library size (low/high UMI): 1257 ( 16.9 %)
##       Feature count (low/high genes): 1348 ( 18.1 %)
##       High mitochondrial: 454 ( 6.1 %)
##       TOTAL outliers: 1536 ( 20.7 %)
##       Cells to KEEP: 5896 ( 79.3 %)
## 
##   Processing 19072-23-04-01-01 with nmads = 3 ...
##     Adaptive thresholds calculated:
##       nCount_RNA: [3351 - 153402]
##       nFeature_RNA: [1785 - 9070]
##       percent.mito: < 1.7
## 
##     Outliers identified:
##       Library size (low/high UMI): 466 ( 5.8 %)
##       Feature count (low/high genes): 598 ( 7.4 %)
##       High mitochondrial: 440 ( 5.5 %)
##       TOTAL outliers: 756 ( 9.4 %)
##       Cells to KEEP: 7297 ( 90.6 %)
## 
## --- Step 2: Pre-filtering QC Summary ---
##                              Sample Cells_Before
## 19072-23-01-01-01 19072-23-01-01-01         6650
## 19072-23-02-01-01 19072-23-02-01-01         5660
## 19072-23-03-01-01 19072-23-03-01-01         7432
## 19072-23-04-01-01 19072-23-04-01-01         8053
## 
## --- Step 3: Filtering Outlier Cells ---
##   19072-23-01-01-01 : 6650 -> 6021 cells
##   19072-23-02-01-01 : 5660 -> 5191 cells
##   19072-23-03-01-01 : 7432 -> 5896 cells
##   19072-23-04-01-01 : 8053 -> 7297 cells
## 
## --- Step 4: Merging Samples ---
## Total cells after filtering: 24405 
## 
## --- Step 5: Adding Project Metadata ---
## --- Table for: group ---
## 
## Sample_01 Sample_02 Sample_03 Sample_04 
##      6021      5191      5896      7297 
## 
## --- Step 6: Post-filtering Statistics ---
## # A tibble: 4 × 5
##   sample            n_cells median_nFeature median_nCount median_mito
##   <chr>               <int>           <dbl>         <dbl>       <dbl>
## 1 19072-23-01-01-01    6021            4671        27762         0.81
## 2 19072-23-02-01-01    5191            3954        16982         0.99
## 3 19072-23-03-01-01    5896            4356        26744.        0.82
## 4 19072-23-04-01-01    7297            4126        24426         0.8 
## 
## --- Step 7: Generating QC Plots ---
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.

## 
## --- Step 8: Running Seurat Analysis Pipeline ---
##   Normalizing...
##   Finding variable features...
##   Scaling data (regressing out percent.mito)...
##   Running PCA...
## 
## --- Step 9: Running Harmony Batch Correction ---
##   Using Harmony reduction
## 
## --- Step 10: Running UMAP and Clustering ---
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session

##   Cluster counts:
##     Resolution 0.1 : 10 clusters
##     Resolution 0.2 : 13 clusters
##     Resolution 0.3 : 16 clusters
##     Resolution 0.5 : 17 clusters
##     Resolution 0.8 : 22 clusters
##     Resolution 1 : 25 clusters
## 
## --- Step 11: Generating UMAP Plots ---

## 
## --- Step 12: Checking Marker Genes ---
##   Markers found: Tnnt2, Myh6, Myh7, Col1a1, Col3a1, Dcn, Pecam1, Cdh5, Vwf, Ptprc, Cd68, Acta2, Myh11

## 
## --- Step 13: Saving processed object to: proj23_harmony_processed.qsave ---
## 
## ========================================
## Finished Project: Project_19072_23 
## Total cells: 24405 
## ========================================

6 5. Project 2: 19072-26

Samples: - 19072-26-01-01-01 - 19072-26-02-01-01 - 19072-26-03-01-01 - 19072-26-04-01-01

proj26_sample_names <- c("19072-26-01-01-01", "19072-26-02-01-01", 
                         "19072-26-03-01-01", "19072-26-04-01-01")

proj26_indices <- which(all_sample_names %in% proj26_sample_names)

if (length(proj26_indices) > 0) {
  proj26_list <- data_list[proj26_indices]
  
  # Create metadata - UPDATE group COLUMN WITH ACTUAL EXPERIMENTAL GROUPS
  proj26_meta <- data.frame(
    sample = proj26_sample_names,
    group = c("Sample_01", "Sample_02", "Sample_03", "Sample_04")  # TODO: Update with real group labels
  )
  
  proj26_obj <- process_project(
    seurat_list = proj26_list,
    project_name = "Project_19072_26",
    sample_metadata_df = proj26_meta,
    output_file = "proj26_harmony_processed.qsave",
    nmads = 3,  # Adjust: 2.5 = stricter, 5 = more lenient
    run_harmony = TRUE,
    harmony_group_by = "sample"
  )
} else {
  cat("No samples found for Project 19072-26.\n")
  cat("Looking for:", proj26_sample_names, "\n")
  cat("Available samples:", all_sample_names, "\n")
}
## 
## ========================================
## Processing Project: Project_19072_26 
## ========================================
## 
## --- Step 1: Automated QC with scuttle::isOutlier ---
## 
##   Processing 19072-26-01-01-01 with nmads = 3 ...
##     Adaptive thresholds calculated:
##       nCount_RNA: [215 - 2860]
##       nFeature_RNA: [138 - 1566]
##       percent.mito: < 9.44
## 
##     Outliers identified:
##       Library size (low/high UMI): 4096 ( 4.5 %)
##       Feature count (low/high genes): 4017 ( 4.4 %)
##       High mitochondrial: 176 ( 0.2 %)
##       TOTAL outliers: 4517 ( 5 %)
##       Cells to KEEP: 86722 ( 95 %)
## 
##   Processing 19072-26-02-01-01 with nmads = 3 ...
##     Adaptive thresholds calculated:
##       nCount_RNA: [253 - 4504]
##       nFeature_RNA: [165 - 2196]
##       percent.mito: < 10.58
## 
##     Outliers identified:
##       Library size (low/high UMI): 2074 ( 3.5 %)
##       Feature count (low/high genes): 1727 ( 2.9 %)
##       High mitochondrial: 123 ( 0.2 %)
##       TOTAL outliers: 2235 ( 3.8 %)
##       Cells to KEEP: 56642 ( 96.2 %)
## 
##   Processing 19072-26-03-01-01 with nmads = 3 ...
##     Adaptive thresholds calculated:
##       nCount_RNA: [182 - 14404]
##       nFeature_RNA: [112 - 554]
##       percent.mito: < 22.29
## 
##     Outliers identified:
##       Library size (low/high UMI): 3 ( 0.2 %)
##       Feature count (low/high genes): 134 ( 10.5 %)
##       High mitochondrial: 106 ( 8.3 %)
##       TOTAL outliers: 240 ( 18.8 %)
##       Cells to KEEP: 1039 ( 81.2 %)
## 
##   Processing 19072-26-04-01-01 with nmads = 3 ...
##     Adaptive thresholds calculated:
##       nCount_RNA: [305 - 3992]
##       nFeature_RNA: [200 - 2023]
##       percent.mito: < 10.58
## 
##     Outliers identified:
##       Library size (low/high UMI): 4584 ( 5.1 %)
##       Feature count (low/high genes): 4076 ( 4.6 %)
##       High mitochondrial: 59 ( 0.1 %)
##       TOTAL outliers: 4745 ( 5.3 %)
##       Cells to KEEP: 84285 ( 94.7 %)
## 
## --- Step 2: Pre-filtering QC Summary ---
##                              Sample Cells_Before
## 19072-26-01-01-01 19072-26-01-01-01        91239
## 19072-26-02-01-01 19072-26-02-01-01        58877
## 19072-26-03-01-01 19072-26-03-01-01         1279
## 19072-26-04-01-01 19072-26-04-01-01        89030
## 
## --- Step 3: Filtering Outlier Cells ---
##   19072-26-01-01-01 : 91239 -> 86722 cells
##   19072-26-02-01-01 : 58877 -> 56642 cells
##   19072-26-03-01-01 : 1279 -> 1039 cells
##   19072-26-04-01-01 : 89030 -> 84285 cells
## 
## --- Step 4: Merging Samples ---
## Total cells after filtering: 228688 
## 
## --- Step 5: Adding Project Metadata ---
## --- Table for: group ---
## 
## Sample_01 Sample_02 Sample_03 Sample_04 
##     86722     56642      1039     84285 
## 
## --- Step 6: Post-filtering Statistics ---
## # A tibble: 4 × 5
##   sample            n_cells median_nFeature median_nCount median_mito
##   <chr>               <int>           <dbl>         <dbl>       <dbl>
## 1 19072-26-01-01-01   86722             453           764        5.2 
## 2 19072-26-02-01-01   56642             589          1042        6.05
## 3 19072-26-03-01-01    1039             242          1431        3.83
## 4 19072-26-04-01-01   84285             619          1071        5.61
## 
## --- Step 7: Generating QC Plots ---
## Warning: Default search for "data" layer in "RNA" assay yielded no results;
## utilizing "counts" layer instead.
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## 
## --- Step 8: Running Seurat Analysis Pipeline ---
##   Normalizing...
##   Finding variable features...
##   Scaling data (regressing out percent.mito)...
##   Running PCA...
## 
## --- Step 9: Running Harmony Batch Correction ---
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 11434400)
##   Using Harmony reduction
## 
## --- Step 10: Running UMAP and Clustering ---
##   Cluster counts:
##     Resolution 0.1 : 20 clusters
##     Resolution 0.2 : 20 clusters
##     Resolution 0.3 : 21 clusters
##     Resolution 0.5 : 23 clusters
##     Resolution 0.8 : 27 clusters
##     Resolution 1 : 28 clusters
## 
## --- Step 11: Generating UMAP Plots ---
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## 
## --- Step 12: Checking Marker Genes ---
##   Markers found: Tnnt2, Myh6, Myh7, Col1a1, Col3a1, Dcn, Pecam1, Cdh5, Vwf, Ptprc, Cd68, Acta2, Myh11
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`
## Rasterizing points since number of points exceeds 100,000.
## To disable this behavior set `raster=FALSE`

## 
## --- Step 13: Saving processed object to: proj26_harmony_processed.qsave ---
## 
## ========================================
## Finished Project: Project_19072_26 
## Total cells: 228688 
## ========================================

7 Session Info

## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.7.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] Matrix_1.7-0                harmony_1.2.0              
##  [3] Rcpp_1.1.0                  Polychrome_1.5.1           
##  [5] qs_0.26.3                   here_1.0.1                 
##  [7] patchwork_1.2.0             ggplot2_3.5.0              
##  [9] dplyr_1.1.4                 cowplot_1.1.3              
## [11] Seurat_5.1.0                SeuratObject_5.0.2         
## [13] sp_2.1-4                    scuttle_1.14.0             
## [15] SingleCellExperiment_1.26.0 SummarizedExperiment_1.34.0
## [17] Biobase_2.64.0              GenomicRanges_1.56.1       
## [19] GenomeInfoDb_1.40.1         IRanges_2.38.1             
## [21] S4Vectors_0.42.1            BiocGenerics_0.50.0        
## [23] MatrixGenerics_1.16.0       matrixStats_1.3.0          
## 
## loaded via a namespace (and not attached):
##   [1] RcppAnnoy_0.0.22          splines_4.4.1            
##   [3] later_1.3.2               tibble_3.2.1             
##   [5] R.oo_1.26.0               polyclip_1.10-7          
##   [7] fastDummies_1.7.4         lifecycle_1.0.4          
##   [9] rprojroot_2.0.4           globals_0.16.3           
##  [11] lattice_0.22-6            MASS_7.3-60.2            
##  [13] magrittr_2.0.3            plotly_4.10.4            
##  [15] sass_0.4.9                rmarkdown_2.28           
##  [17] jquerylib_0.1.4           yaml_2.3.10              
##  [19] httpuv_1.6.15             sctransform_0.4.1        
##  [21] spam_2.10-0               spatstat.sparse_3.1-0    
##  [23] reticulate_1.42.0         pbapply_1.7-2            
##  [25] RColorBrewer_1.1-3        abind_1.4-5              
##  [27] zlibbioc_1.50.0           Rtsne_0.17               
##  [29] purrr_1.0.2               R.utils_2.12.3           
##  [31] GenomeInfoDbData_1.2.12   ggrepel_0.9.5            
##  [33] irlba_2.3.5.1             listenv_0.9.1            
##  [35] spatstat.utils_3.1-0      goftest_1.2-3            
##  [37] RSpectra_0.16-2           spatstat.random_3.3-1    
##  [39] fitdistrplus_1.2-1        parallelly_1.38.0        
##  [41] DelayedMatrixStats_1.26.0 leiden_0.4.3.1           
##  [43] codetools_0.2-20          DelayedArray_0.30.1      
##  [45] RApiSerialize_0.1.3       tidyselect_1.2.1         
##  [47] UCSC.utils_1.0.0          farver_2.1.2             
##  [49] spatstat.explore_3.3-1    jsonlite_1.8.8           
##  [51] progressr_0.14.0          ggridges_0.5.6           
##  [53] survival_3.6-4            tools_4.4.1              
##  [55] ica_1.0-3                 glue_1.8.0               
##  [57] gridExtra_2.3             SparseArray_1.4.8        
##  [59] xfun_0.52                 withr_3.0.1              
##  [61] BiocManager_1.30.26       fastmap_1.2.0            
##  [63] fansi_1.0.6               digest_0.6.36            
##  [65] R6_2.5.1                  mime_0.12                
##  [67] colorspace_2.1-1          scattermore_1.2          
##  [69] tensor_1.5                dichromat_2.0-0.1        
##  [71] spatstat.data_3.1-2       R.methodsS3_1.8.2        
##  [73] RhpcBLASctl_0.23-42       utf8_1.2.4               
##  [75] tidyr_1.3.1               generics_0.1.3           
##  [77] data.table_1.15.4         httr_1.4.7               
##  [79] htmlwidgets_1.6.4         S4Arrays_1.4.1           
##  [81] scatterplot3d_0.3-44      uwot_0.2.2               
##  [83] pkgconfig_2.0.3           gtable_0.3.6             
##  [85] lmtest_0.9-40             XVector_0.44.0           
##  [87] htmltools_0.5.8.1         dotCall64_1.1-1          
##  [89] scales_1.4.0              png_0.1-8                
##  [91] spatstat.univar_3.0-0     knitr_1.48               
##  [93] rstudioapi_0.16.0         reshape2_1.4.4           
##  [95] nlme_3.1-164              cachem_1.1.0             
##  [97] zoo_1.8-12                stringr_1.5.1            
##  [99] KernSmooth_2.23-24        vipor_0.4.7              
## [101] parallel_4.4.1            miniUI_0.1.1.1           
## [103] ggrastr_1.0.2             pillar_1.9.0             
## [105] grid_4.4.1                vctrs_0.6.5              
## [107] RANN_2.6.1                promises_1.3.0           
## [109] stringfish_0.16.0         beachmat_2.20.0          
## [111] xtable_1.8-4              cluster_2.1.6            
## [113] beeswarm_0.4.0            evaluate_0.24.0          
## [115] cli_3.6.3                 compiler_4.4.1           
## [117] rlang_1.1.4               crayon_1.5.3             
## [119] future.apply_1.11.2       labeling_0.4.3           
## [121] ggbeeswarm_0.7.2          plyr_1.8.9               
## [123] stringi_1.8.4             viridisLite_0.4.2        
## [125] deldir_2.0-4              BiocParallel_1.38.0      
## [127] lazyeval_0.2.2            spatstat.geom_3.3-2      
## [129] RcppHNSW_0.6.0            sparseMatrixStats_1.16.0 
## [131] future_1.34.0             shiny_1.9.1              
## [133] highr_0.11                ROCR_1.0-11              
## [135] igraph_2.0.3              RcppParallel_5.1.8       
## [137] bslib_0.8.0